import sys

from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import PyPDFLoader
# 加载 .env 到环境变量
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())
# 加载文档
loader = PyPDFLoader("llama2.pdf")
pages = loader.load_and_split()

# 文档切分
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=100,
    length_function=len,
    add_start_index=True,
)

texts = text_splitter.create_documents(
    [page.page_content for page in pages[:4]]
)

# 灌库
embeddings = OpenAIEmbeddings()
db = Chroma.from_documents(texts, embeddings)

# 检索 top-1 结果
retriever = db.as_retriever(search_kwargs={"k": 1})

docs = retriever.get_relevant_documents("llama2 chat有多少参数")

print(docs[0].page_content)
'''
文档处理部分 LangChain 实现较为粗糙，实际生产中不建议使用
与向量数据库的链接部分本质是接口封装，向量数据库需要自己选型
'''